# Result of a speech-to-text API call: recognized candidates plus confidence.
#
# NOTE(review): confidence[i] presumably scores transcript[i], which would
# require both arrays to have the same length -- confirm against the publisher.
# NOTE(review): the value range of confidence is not specified in this file
# (speech-to-text APIs commonly report 0.0-1.0) -- confirm.
string[] transcript   # candidate words returned by the speech-to-text API
float32[] confidence  # confidence score(s) for the transcript candidates